From 3a93e301ddc913758abe05c876aa3016e8b23af8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Tue, 13 Feb 2024 14:52:39 +0100 Subject: [PATCH] String hashing improvements (spread and performance) Fix gaps in hashing coverage in the middle and end of even fairly short strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact same value but with the patch, there are no collisions among the ~160000 symbols in the Emacs tree. This change also improves average hashing speed by using fewer mixing operations. * src/fns.c (hash_string): Use unit stride for fairly short strings, while retaining the cap of 8 samples for long ones. Always hash the last word to ensure that the end of the string is covered. For strings shorter than a word, use fewer loads and a single reduction step. --- src/fns.c | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/src/fns.c b/src/fns.c index 918ba0370e8..f94e8519957 100644 --- a/src/fns.c +++ b/src/fns.c @@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len) EMACS_UINT hash = len; /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course, * but dividing by 8 is cheaper. */ - ptrdiff_t step = sizeof hash + ((end - p) >> 3); + ptrdiff_t step = max (sizeof hash, ((end - p) >> 3)); - while (p + sizeof hash <= end) + if (p + sizeof hash <= end) { + do + { + EMACS_UINT c; + /* We presume that the compiler will replace this `memcpy` with + a single load/move instruction when applicable. */ + memcpy (&c, p, sizeof hash); + p += step; + hash = sxhash_combine (hash, c); + } + while (p + sizeof hash <= end); + /* Hash the last wordful of bytes in the string, because that is + is often the part where strings differ. This may cause some + bytes to be hashed twice but we assume that's not a big problem. */ EMACS_UINT c; - /* We presume that the compiler will replace this `memcpy` with - a single load/move instruction when applicable. */ - memcpy (&c, p, sizeof hash); - p += step; + memcpy (&c, end - sizeof c, sizeof c); hash = sxhash_combine (hash, c); } - /* A few last bytes may remain (smaller than an EMACS_UINT). */ - /* FIXME: We could do this without a loop, but it'd require - endian-dependent code :-( */ - while (p < end) + else { - unsigned char c = *p++; - hash = sxhash_combine (hash, c); + /* String is shorter than an EMACS_UINT. Use smaller loads. */ + eassume (p <= end && end - p < sizeof (EMACS_UINT)); + EMACS_UINT tail = 0; + if (end - p >= 4) + { + uint32_t c; + memcpy (&c, p, sizeof c); + tail = (tail << (8 * sizeof c)) + c; + p += sizeof c; + } + if (end - p >= 2) + { + uint16_t c; + memcpy (&c, p, sizeof c); + tail = (tail << (8 * sizeof c)) + c; + p += sizeof c; + } + if (p < end) + tail = (tail << 8) + (unsigned char)*p; + hash = sxhash_combine (hash, tail); } return hash; -- 2.30.2